Read text file line by line and extract first layer main feature

In [49]:
import json
import pandas as pd

file_sensor = 'sensorswww_data.txt'

# read data from raw file (mode='r' -> read)
f = open(file_sensor,encoding='utf-8',mode='r')

# read all lines and saved as array
lines = f.readlines()

print ("Total records in sensor raw data: %d \n" %len(lines))
print ("First Line: \n")
print (lines[0])
Total records in sensor raw data: 75092 

First Line: 

{"distinct_id":"595466e9a8e733434ce08de16e927d985e0b5d48","lib":{"$lib":"js","$lib_method":"code","$lib_version":"1.6.20"},"properties":{"$os":"windows","$model":"pc","$os_version":"6.1","$screen_height":800,"$screen_width":1280,"$lib":"js","$lib_version":"1.6.20","$browser":"chrome","$browser_version":"56","$latest_referrer":"","$latest_referrer_host":"","$latest_utm_source":"baidu","$latest_utm_medium":"cpc","$latest_utm_campaign":"通用词","$latest_utm_content":"通用-用户画像","$latest_utm_term":"用户画像","_latest_ch":"demo","_session_referrer":"https://www.baidu.com/baidu.php","_session_referrer_host":"www.baidu.com","session_page_url":"https://www.sensorsdata.cn/?utm_source=baidu&utm_medium=cpc&utm_term=%E7%94%A8%E6%88%B7%E7%94%BB%E5%83%8F&utm_content=%E9%80%9A%E7%94%A8%2D%E7%94%A8%E6%88%B7%E7%94%BB%E5%83%8F&utm_campaign=%E9%80%9A%E7%94%A8%E8%AF%8D","pageUrl":"https://sensorsdata.cn/?ch=demo","pageStayTime":5.692,"pagePosition":2,"$is_first_day":true,"$is_first_time":false,"$ip":"219.135.131.99"},"type":"track","event":"index_leave","_nocache":"0654392402996","time":1488791047953}

In [50]:
# set: unorder and non-duplicate value
main_features=set()

print ("Main Features: \n")

# items() returns a list of dict (key,value) tuple pairs
for i in range(len(lines)):
    line_json = json.loads(lines[i])
    for features,value in line_json.items():
        main_features.add(features)

print (main_features,"\n")

print ("In the sensor data, we have %d main features." %len(main_features))
Main Features: 

{'distinct_id', 'type', 'lib', 'time', '_nocache', 'event', 'properties', 'jssdk_error'} 

In the sensor data, we have 8 main features.
In [51]:
# Use data type to judge if any main feature is necessary to deep dig in.  
main_features_type = set()

for i in range(len(lines)):
    line_json = json.loads(lines[i])
    for feature,value in line_json.items():
        main_features_type.add((feature,type(line_json[feature])))

for j in main_features_type:
    print (j)
('distinct_id', <class 'str'>)
('properties', <class 'dict'>)
('time', <class 'int'>)
('_nocache', <class 'str'>)
('jssdk_error', <class 'str'>)
('type', <class 'str'>)
('lib', <class 'dict'>)
('event', <class 'str'>)

Conclusion: lib and properties with dict type have deeper layers and must be studied more.

Learn features with sub layers: lib & properties

In [52]:
# lib
lib_subfeatures = set()
lib_subfeatures_type = set()

for i in range(len(lines)):
    line_json = json.loads(lines[i])
    for feature,value in line_json["lib"].items():
        lib_subfeatures.add(feature)
        lib_subfeatures_type.add((feature,type(line_json["lib"][feature])))
        
for j in lib_subfeatures_type:
    print (j,"\n")
    
print ("There are %d sub featuress in lib." %len(lib_subfeatures))
('$lib_version', <class 'str'>) 

('$lib_method', <class 'str'>) 

('$lib', <class 'str'>) 

There are 3 sub featuress in lib.
In [58]:
#properties
properties_subfeatures = set()
properties_subfeatures_type = set()

for i in range(len(lines)):
    line_json = json.loads(lines[i])
    for feature,value in line_json["properties"].items():
        properties_subfeatures.add(feature)
        properties_subfeatures_type.add((feature,type(line_json["properties"][feature])))

for j in properties_subfeatures_type:
    print (j)

print ("\n There are %d sub features in properties. \n" %len(properties_subfeatures))                                   
        
('_session_referrer_domain', <class 'str'>)
('$ip', <class 'str'>)
('$is_first_time', <class 'bool'>)
('$latest_referrer_host', <class 'str'>)
('$screen_width', <class 'int'>)
('$first_visit_time', <class 'str'>)
('$latest_utm_content', <class 'str'>)
('info', <class 'str'>)
('contact', <class 'str'>)
('$first_referrer', <class 'str'>)
('$utm_content', <class 'str'>)
('$referrer_host', <class 'str'>)
('result', <class 'str'>)
('$os_version', <class 'str'>)
('$latest_utm_medium', <class 'str'>)
('$latest_utm_term', <class 'str'>)
('_latest_ch', <class 'str'>)
('url_path', <class 'str'>)
('isSuccess', <class 'bool'>)
('$model', <class 'str'>)
('pageStayTime', <class 'float'>)
('$utm_term', <class 'str'>)
('site_url', <class 'str'>)
('isMsg', <class 'bool'>)
('referrerUrl', <class 'str'>)
('$first_browser_language', <class 'str'>)
('$browser', <class 'str'>)
('requestBtn', <class 'str'>)
('$latest_utm_campaign', <class 'str'>)
('$utm_source', <class 'str'>)
('pageUrl', <class 'str'>)
('pagePosition', <class 'int'>)
('$url', <class 'str'>)
('$latest_referrer', <class 'str'>)
('company', <class 'str'>)
('$os', <class 'str'>)
('_session_from_url', <class 'str'>)
('$referrer', <class 'str'>)
('name', <class 'str'>)
('$first_referrer_host', <class 'str'>)
('page', <class 'str'>)
('$lib_version', <class 'str'>)
('verification_code', <class 'str'>)
('$screen_height', <class 'int'>)
('_session_referrer', <class 'str'>)
('email', <class 'str'>)
('from_url', <class 'str'>)
('$url_path', <class 'str'>)
('ch', <class 'str'>)
('pageStayTime', <class 'int'>)
('project_name', <class 'str'>)
('$utm_campaign', <class 'str'>)
('$lib', <class 'str'>)
('$utm_medium', <class 'str'>)
('_session_referrer_host', <class 'str'>)
('$latest_utm_source', <class 'str'>)
('siteUrl', <class 'str'>)
('$is_first_day', <class 'bool'>)
('referrHostUrl', <class 'str'>)
('$browser_version', <class 'str'>)
('session_page_url', <class 'str'>)
('$title', <class 'str'>)

 There are 61 sub features in properties. 

Create a list of dictionaries and then convert to dataframe

In [38]:
from pandas.io.json import json_normalize

dict_sensor_raw = [] # create a list to store all the records (type: dictionry)

for i in range(len(lines)):              
    line_json = json.loads(lines[i])
    dict_sensor_raw.append(line_json)

df_sensor_raw = json_normalize(dict_sensor_raw)
In [62]:
# json_normalize: normalize semi-structured json data into a flat table
from pandas.io.json import json_normalize

dict_sensor = []

for i in range(len(lines)):
    line_json = json.loads(lines[i])
    dict_sensor.append(line_json)
    
df_sensor = json_normalize(dict_sensor)
df_sensor.head()
Out[62]:
_nocache distinct_id event jssdk_error lib.$lib lib.$lib_method lib.$lib_version properties.$browser properties.$browser_version properties.$first_browser_language ... properties.referrerUrl properties.requestBtn properties.result properties.session_page_url properties.siteUrl properties.site_url properties.url_path properties.verification_code time type
0 0654392402996 595466e9a8e733434ce08de16e927d985e0b5d48 index_leave NaN js code 1.6.20 chrome 56 NaN ... NaN NaN NaN https://www.sensorsdata.cn/?utm_source=baidu&u... NaN NaN NaN NaN 1488791047953 track
1 3040562711955 9939d3e087bca29c42334d96dccd25ca0e06652a NaN NaN js code 1.6.20 NaN NaN zh-CN ... NaN NaN NaN NaN NaN NaN NaN NaN 1490958296645 profile_set_once
2 9587552771961 9939d3e087bca29c42334d96dccd25ca0e06652a $pageview NaN js code 1.6.20 chrome 56 NaN ... NaN NaN NaN https://sensorsdata.cn/?ch=demo NaN NaN NaN NaN 1488791050856 track
3 0652937076129 9939d3e087bca29c42334d96dccd25ca0e06652a btnClick NaN js code 1.6.20 chrome 56 NaN ... NaN 2 NaN https://sensorsdata.cn/?ch=demo NaN NaN NaN NaN 1488791051772 track
4 8207407748558 9939d3e087bca29c42334d96dccd25ca0e06652a btnClick NaN js code 1.6.20 chrome 56 NaN ... NaN 2 NaN https://sensorsdata.cn/?ch=demo NaN NaN NaN NaN 1488791056032 track

5 rows × 70 columns

In [63]:
df_sensor.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75092 entries, 0 to 75091
Data columns (total 70 columns):
_nocache                               75092 non-null object
distinct_id                            75092 non-null object
event                                  65673 non-null object
jssdk_error                            58 non-null object
lib.$lib                               75092 non-null object
lib.$lib_method                        75092 non-null object
lib.$lib_version                       75092 non-null object
properties.$browser                    65673 non-null object
properties.$browser_version            65673 non-null object
properties.$first_browser_language     9148 non-null object
properties.$first_referrer             9369 non-null object
properties.$first_referrer_host        9369 non-null object
properties.$first_visit_time           9419 non-null object
properties.$ip                         65673 non-null object
properties.$is_first_day               65673 non-null object
properties.$is_first_time              65673 non-null object
properties.$latest_referrer            64862 non-null object
properties.$latest_referrer_host       64862 non-null object
properties.$latest_utm_campaign        39903 non-null object
properties.$latest_utm_content         38763 non-null object
properties.$latest_utm_medium          39691 non-null object
properties.$latest_utm_source          40583 non-null object
properties.$latest_utm_term            39095 non-null object
properties.$lib                        65673 non-null object
properties.$lib_version                65673 non-null object
properties.$model                      65673 non-null object
properties.$os                         65673 non-null object
properties.$os_version                 65673 non-null object
properties.$referrer                   32334 non-null object
properties.$referrer_host              32335 non-null object
properties.$screen_height              65673 non-null float64
properties.$screen_width               65673 non-null float64
properties.$title                      32620 non-null object
properties.$url                        32598 non-null object
properties.$url_path                   32620 non-null object
properties.$utm_campaign               13903 non-null object
properties.$utm_content                13766 non-null object
properties.$utm_medium                 13896 non-null object
properties.$utm_source                 14047 non-null object
properties.$utm_term                   13801 non-null object
properties._latest_ch                  8537 non-null object
properties._session_from_url           114 non-null object
properties._session_referrer           63046 non-null object
properties._session_referrer_domain    116 non-null object
properties._session_referrer_host      63039 non-null object
properties.ch                          602 non-null object
properties.company                     1304 non-null object
properties.contact                     1304 non-null object
properties.email                       1304 non-null object
properties.from_url                    982 non-null object
properties.info                        1163 non-null object
properties.isMsg                       469 non-null object
properties.isSuccess                   791 non-null object
properties.name                        15170 non-null object
properties.page                        11716 non-null object
properties.pagePosition                16490 non-null float64
properties.pageStayTime                16720 non-null float64
properties.pageUrl                     30356 non-null object
properties.project_name                2534 non-null object
properties.referrHostUrl               469 non-null object
properties.referrerUrl                 469 non-null object
properties.requestBtn                  4271 non-null object
properties.result                      1163 non-null object
properties.session_page_url            62766 non-null object
properties.siteUrl                     469 non-null object
properties.site_url                    982 non-null object
properties.url_path                    230 non-null object
properties.verification_code           1304 non-null object
time                                   75092 non-null int64
type                                   75092 non-null object
dtypes: float64(4), int64(1), object(65)
memory usage: 40.1+ MB

Save dataframe as a CSV file

In [55]:
df_sensor_raw.to_csv('../data/sensors.csv', index= False, encoding='utf-8')
In [64]:
path = 'sensorswww.csv'
df_sensor.to_csv(path,index=False,encoding='utf-8')